{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the disease spreadsheet; empty cells are replaced by the sentinel string 'NONE'.\n",
    "d = pd.read_excel('./disease.xlsx').fillna('NONE')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column values as plain Python lists; d_name[k] and d_alias[k] come from the same row.\n",
    "d_name = d['name'].values.tolist()\n",
    "d_alias = d['alias'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build relation samples for diseases from the free-text `alias` column.\n",
    "# NOTE: `alias_vovab` is loaded by a cell that appears further down in this notebook;\n",
    "# run that cell first on a fresh kernel, otherwise this cell raises NameError.\n",
    "d_a_d = []\n",
    "temp = '{0}又称为{1}'\n",
    "disease_alias_add_data = []\n",
    "for name, raw_alias in zip(d_name, d_alias):\n",
    "    if raw_alias == 'NONE':\n",
    "        continue  # this disease has no alias text\n",
    "    line = raw_alias.replace(' ', '')\n",
    "    alias_list = []\n",
    "    # '别名' samples: the disease name is <e1>, each vocabulary entry found in the text is <e2>.\n",
    "    for v in alias_vovab:\n",
    "        if len(v) > 1 and v in line and v != name:\n",
    "            alias_list.append(v)\n",
    "            new_line = line.replace(v, '<e2>' + v + '</e2>')\n",
    "            new_line = temp.format('<e1>' + name + '</e1>', new_line)\n",
    "            disease_alias_add_data.append('别名' + '\\t' + new_line)\n",
    "    # 'unknown' samples: two aliases found for the same disease are tagged <e1>/<e2>.\n",
    "    # The loop uses its own variable names so the disease name is not shadowed:\n",
    "    # the sentence subject stays the disease, matching how the food samples use the food name.\n",
    "    for pos, first in enumerate(alias_list):\n",
    "        for second in alias_list[pos + 1:]:\n",
    "            new_line = line.replace(first, '<e1>' + first + '</e1>').replace(second, '<e2>' + second + '</e2>')\n",
    "            new_line = temp.format(name, new_line)\n",
    "            disease_alias_add_data.append('unknown' + '\\t' + new_line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the alias vocabulary: one entry per line, surrounding whitespace stripped.\n",
    "alias_vovab = []\n",
    "with open('./alias_vocab.txt') as f:\n",
    "    alias_vovab.extend(entry.strip() for entry in f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['滑石粉致尘肺',\n",
       " '成人皮硬化病',\n",
       " '伤寒两感症',\n",
       " '室性早搏',\n",
       " '肾著',\n",
       " '锥形虫症',\n",
       " '痛性瘀斑综合征',\n",
       " 'ALS',\n",
       " '胎儿型软骨营养障碍',\n",
       " '吸烟斑',\n",
       " '老年人糖尿病性乳酸性酸中毒',\n",
       " '眶底骨折',\n",
       " '过渡脊椎',\n",
       " '妊娠合并生殖道沙眼衣原体感染',\n",
       " '三B征',\n",
       " '黄体后期心境恶劣障碍',\n",
       " '小儿智力迟钝',\n",
       " '结核性子宫颈炎',\n",
       " '食管化学灼伤',\n",
       " '肺伴有嗜酸细胞增多肺浸润',\n",
       " '大肠杆菌肺炎',\n",
       " '结核性子宫内膜炎',\n",
       " '成人肥厚性幽门狭窄',\n",
       " '小儿莱特尔氏综合征',\n",
       " '尤因肉瘤',\n",
       " '特发性食管黏膜剥脱症',\n",
       " '明细胞棘皮瘤',\n",
       " '小儿脑性低钠血症',\n",
       " '藏毛囊肿',\n",
       " '小儿家族非溶血性黄疸',\n",
       " '吕弗琉综合征',\n",
       " '非典型脊椎结核',\n",
       " '全面性发作癫痫',\n",
       " '骨痹',\n",
       " '小儿Engelman病',\n",
       " '急性焦虑症',\n",
       " '动脉性痣',\n",
       " '颅内静脉窦闭塞性颅内高血压',\n",
       " '酒精性神经病变',\n",
       " '绀红皮病',\n",
       " '跖骨行军骨折',\n",
       " '男性乳癌',\n",
       " '新生儿臀部红斑',\n",
       " '先天性内斜视',\n",
       " '肝纤维变性',\n",
       " '水壶腹综合征',\n",
       " '枕骨大孔疝',\n",
       " '肺包虫囊肿',\n",
       " '无β-脂蛋白血症',\n",
       " '惊恐障碍',\n",
       " '非甲非乙型肝炎病毒感染与肾小球性肾炎',\n",
       " '噪声性耳外伤',\n",
       " '灰皮病',\n",
       " '小儿神经白塞病',\n",
       " '陶瓷样胆囊病',\n",
       " '承霤病',\n",
       " '小儿迟发性低丙种球蛋白血症',\n",
       " '膜增殖性肾小球肾炎',\n",
       " '婴儿性手足搐搦症',\n",
       " '小儿甲旁低',\n",
       " '美洲锥虫病',\n",
       " '外阴纤维神经瘤',\n",
       " '宫颈早期间质浸润癌',\n",
       " '小肠血管发育不良',\n",
       " '性连锁遗传寻常鱼鳞病',\n",
       " '骨膜发育不良',\n",
       " '慢性溃疡',\n",
       " '乙型肝炎-肾炎',\n",
       " '小儿乙型肝炎免疫复合物肾炎',\n",
       " '鞘磷脂沉积病',\n",
       " '躁狂抑郁症',\n",
       " '遗传性慢性肾炎',\n",
       " '小儿Klippel-Feil综合征',\n",
       " '小儿史-约综合症',\n",
       " '医院坏死',\n",
       " '颅后窝室管膜细胞瘤',\n",
       " '慢性肥厚性外阴炎',\n",
       " '隐匿性肾小球肾炎',\n",
       " '中隔支阻滞',\n",
       " '皮肤黏膜透明蛋白变性',\n",
       " '小儿甲基丙二酸尿症',\n",
       " '未分化网织细胞肉瘤',\n",
       " '新月体肾小球肾炎',\n",
       " '卵巢子宫内膜异位囊肿破裂',\n",
       " '小儿秽语综合征',\n",
       " '克汀病',\n",
       " '小儿累若纳综合征',\n",
       " '包皮包茎阴茎',\n",
       " '痃癖',\n",
       " 'ASPS',\n",
       " '小儿Klippel-Trenaunay综合征',\n",
       " '先天性血管萎缩性皮肤异色病',\n",
       " '后牙错合畸形',\n",
       " '横产式',\n",
       " '新生儿柯萨基病毒B组感染',\n",
       " '巴林格尔病',\n",
       " '直肠子宫内膜异位',\n",
       " '获得性免疫功能丧失症病人的急性阑尾炎',\n",
       " '遗传梅毒',\n",
       " '结节性淋巴组织样增生',\n",
       " '矿工肘',\n",
       " '非溃疡性角膜炎',\n",
       " '足分支菌病',\n",
       " '轮状病毒性胃肠炎',\n",
       " '胎盘早期剥离',\n",
       " '药物中毒所致的精神障碍',\n",
       " '老年人Hamman-Rich综合征',\n",
       " '甲旁亢',\n",
       " '自发性面神经麻痹',\n",
       " 'Graefe综合征',\n",
       " '春季角膜结膜炎',\n",
       " '卡氏肺孢子虫肺炎',\n",
       " '血小板机能不全',\n",
       " '糖尿病合并血糖过低',\n",
       " '黄瘤症',\n",
       " '阴道卵黄囊瘤',\n",
       " '疱疹性湿疹',\n",
       " '更年期关节炎',\n",
       " '痛痹',\n",
       " '尺侧屈腕肌损伤',\n",
       " '旋螺突起',\n",
       " '幼年型慢性关节炎及其伴发的色素膜炎',\n",
       " 'Bazin病',\n",
       " '肉芽肿性唇炎',\n",
       " '痛性压迫性足部丘疹',\n",
       " '桥本甲状腺炎',\n",
       " '军团杆菌性肺炎',\n",
       " '子宫切除后盆底疝',\n",
       " '短食管',\n",
       " '睾丸突出',\n",
       " '腊泪样骨病',\n",
       " '瘟黄',\n",
       " '新生儿剥脱样皮炎',\n",
       " '假麻痹性重症肌无力',\n",
       " '男性假两性同体',\n",
       " '缺汗症',\n",
       " '卵巢内胚窦瘤',\n",
       " '小儿Marie-Sainton综合征',\n",
       " '严重急性呼吸综合征',\n",
       " 'Ⅲ度宫颈糜烂',\n",
       " '肺动脉栓塞综合征',\n",
       " '小儿胸腺不发育',\n",
       " '肥疮',\n",
       " '小儿Tourette综合征',\n",
       " '精神发育迟缓',\n",
       " '老年脊髓压迫症',\n",
       " '暑湿感冒',\n",
       " '良性颅内高血压',\n",
       " '氏病',\n",
       " '小儿两性同体',\n",
       " '老年上腔静脉受压综合征',\n",
       " '严重性外阴阴道炎',\n",
       " '肝间叶性错构瘤',\n",
       " '动脉粥样硬化性脑血栓',\n",
       " '假性黑色棘皮症',\n",
       " '全身中毒性休克',\n",
       " '透明细胞汗腺瘤',\n",
       " '软垂疣',\n",
       " '小儿成人透明膜病',\n",
       " '青年性驼背',\n",
       " '老年人特异性心肌病',\n",
       " '复发性卵巢恶性瘤',\n",
       " '小儿智力迟延发育',\n",
       " '克尔病',\n",
       " '恶性结节性汗腺瘤',\n",
       " '肺胀',\n",
       " '重复肾输尿管',\n",
       " '左心发育不良综合征',\n",
       " '颌下间隙急性化脓性感染',\n",
       " '西部马脑炎',\n",
       " '房早',\n",
       " '蕈样霉菌病',\n",
       " '系统性变应性血管炎',\n",
       " '淋球菌性关节炎',\n",
       " '维生素A缺乏性眼病',\n",
       " '对口疮',\n",
       " '淀粉贮积病',\n",
       " '先天性乙醇综合征',\n",
       " '韦尔综合征',\n",
       " '前交叉韧带损伤',\n",
       " '石淋',\n",
       " '角膜鳞状上皮细胞癌',\n",
       " '足球踝',\n",
       " '斑状白斑病',\n",
       " '法沙吉尔氏神经痛',\n",
       " '眼眶骨纤维异常增殖',\n",
       " '肠源性肢端皮炎综合征',\n",
       " '左心室恶性Schwann瘤',\n",
       " '毛囊皮脂腺粘蛋白沉积症',\n",
       " '喉部挫伤',\n",
       " '老年胃食管反流病',\n",
       " '变形杆菌性肺炎',\n",
       " '小儿吉福德氏综合征',\n",
       " '原发性急进性肾小球肾炎',\n",
       " '山菲利普综合征',\n",
       " '腱鞘炎',\n",
       " '无症状性蛋白尿及血尿',\n",
       " '胶耳',\n",
       " '囊性肾脏病',\n",
       " '儿童期脑性巨人畸形综合征',\n",
       " '肾性失盐综合征',\n",
       " '蓝神经痣',\n",
       " '丝绸之路病',\n",
       " '脑-面血管瘤病',\n",
       " '自发性大网膜梗死',\n",
       " 'Fisher综合征',\n",
       " '小儿路易斯·巴尔综合征',\n",
       " '慢性萎缩性肾盂肾炎',\n",
       " '先天性高位肩胛症畸形',\n",
       " '4对咽囊综合征',\n",
       " '高免疫球蛋白E综合症',\n",
       " '妊娠女性阑尾脓肿',\n",
       " '贝切特氏病性巩膜炎',\n",
       " '无痛性心肌梗塞',\n",
       " '肝着',\n",
       " '锁骨下动脉窃血综合征',\n",
       " '凡科尼综合征',\n",
       " '角膜带状变性',\n",
       " '威蓝氏病',\n",
       " '小儿青花鱼中毒',\n",
       " '小儿先天性全色盲',\n",
       " '胆囊炎',\n",
       " '新生儿天疱疮',\n",
       " '老年性关节炎',\n",
       " '肾痨',\n",
       " '歪脖儿',\n",
       " '丑角样鱼鳞病',\n",
       " '赤脉如缕',\n",
       " '比-桑综合征',\n",
       " '血管炎前期综合症',\n",
       " '血癌',\n",
       " '甘露糖苷过多症',\n",
       " '小儿动脉瘤样骨性囊肿',\n",
       " '水泡状胎',\n",
       " '小儿Russell-Silver综合征',\n",
       " '肠绞痛',\n",
       " '毛发糠疹',\n",
       " '脉络膜视网膜炎',\n",
       " 'Bowen-Lee-Zellweger综合征',\n",
       " '小儿FanconiⅡ综合征',\n",
       " '屈曲(旋转)型骨折脱位',\n",
       " '海绵肾',\n",
       " '流火病',\n",
       " '急性多灶性缺血性脉络膜病变',\n",
       " '或感染性心内膜炎',\n",
       " '等孢子球虫病',\n",
       " '血栓症',\n",
       " '小儿蓝贾第鞭毛虫病',\n",
       " '风痧',\n",
       " '急性脂肪性硬皮病',\n",
       " '创伤性食管黏膜表层管型剥脱',\n",
       " '晶状体蛋白性青光眼',\n",
       " '外阴内胚层窦瘤',\n",
       " '新生儿急性坏死性小肠结肠炎',\n",
       " '儿童型肺结核',\n",
       " '胃淋巴瘤',\n",
       " '外阴软垂疣',\n",
       " '烟酸缺乏病',\n",
       " '小儿血管网织内皮瘤',\n",
       " '毛细血管扩张性肉芽肿',\n",
       " '小儿爱-唐综合征',\n",
       " '自发性大量腹腔内出血',\n",
       " '脓疱性疱疹样皮炎',\n",
       " '后极部色素膜出血综合征',\n",
       " '趾底总神经瘤',\n",
       " '不全型或顿挫型狼疮',\n",
       " '小儿脑水肿',\n",
       " '脉络膜视网膜环状萎缩',\n",
       " '特发性高钙尿',\n",
       " '甲亢性肝病',\n",
       " '婴儿骨皮质增生症',\n",
       " '血管网状内皮瘤',\n",
       " '广泛视网膜前牵拉',\n",
       " '皮下脂肪萎缩',\n",
       " '脊髓神经纤维瘤',\n",
       " '头部棘细胞癌',\n",
       " '吸气性气道阻塞综合征',\n",
       " '钙化性胆囊',\n",
       " '心脏衰弱',\n",
       " '心内膜炎',\n",
       " '间质角膜炎-眩晕-神经性耳聋综合征',\n",
       " '布罗迪骨脓肿',\n",
       " '血小板第3因子缺陷症',\n",
       " '打摆子',\n",
       " '外阴部的皮肤或粘膜发炎',\n",
       " '食管腐蚀伤',\n",
       " '颈痈',\n",
       " '原发性胃淋巴瘤',\n",
       " '胃假性淋巴瘤',\n",
       " '卡他微球菌感染',\n",
       " '维生素D缺乏神经病变',\n",
       " '西蒙氏症',\n",
       " '槟榔肝',\n",
       " '白口疮',\n",
       " '金葡肺',\n",
       " '霉菌性关节炎',\n",
       " '小儿变态反应性紫癜',\n",
       " '地图状脉络膜视网膜炎',\n",
       " '桡骨棒状手',\n",
       " '囊发等',\n",
       " '不稳定心绞痛',\n",
       " '淀粉样心肌病',\n",
       " '钩口线虫病',\n",
       " '高血压病',\n",
       " '外阴粒性成肌细胞瘤',\n",
       " '角膜间质炎',\n",
       " '美洲山岭蜱热',\n",
       " '室管膜细胞瘤',\n",
       " '肱骨小头骨骺分离',\n",
       " '类癌瘤综合征',\n",
       " '软组织假恶性骨肿瘤',\n",
       " '原发性卵巢比-桑二氏综合征',\n",
       " '喉憩室',\n",
       " '成人Wissler综合征',\n",
       " '小儿流感',\n",
       " '小儿肾血管高血压',\n",
       " '先天性摇椅足',\n",
       " '肺型血吸虫病',\n",
       " '小儿原发性甲状旁腺机能亢进',\n",
       " '压力性气胸',\n",
       " '洁癖症',\n",
       " '类固醇抵抗型哮喘',\n",
       " '肠系膜脂性肉芽肿',\n",
       " '后部多形性营养不良',\n",
       " '急性心梗',\n",
       " '高原心脏病',\n",
       " '早产贫血',\n",
       " '慢性病性贫血',\n",
       " '妊娠合并糖尿病',\n",
       " '小儿特发性高钙尿症',\n",
       " '枕骨大孔区脑膜瘤',\n",
       " '甲减',\n",
       " '慢性萎缩性肢端皮炎',\n",
       " '春季卡他性结膜炎',\n",
       " '指拐垫',\n",
       " '诺卡氏菌性巩膜炎',\n",
       " '创伤窒息综合症',\n",
       " '威尔逊变性',\n",
       " '老年泌尿系感染',\n",
       " '动静脉瘘',\n",
       " '脂肪纤维瘤',\n",
       " '迁延性肺嗜酸性粒细胞浸润症',\n",
       " '尿毒症肺',\n",
       " '假性胰腺囊肿',\n",
       " '舞蹈病-棘红细胞增多症',\n",
       " '老年性色素斑(痣)',\n",
       " '寒害',\n",
       " '粘多糖病Ⅴ型',\n",
       " '小儿Bloch-Siemens综合征',\n",
       " '会阴撕裂',\n",
       " '库蒂斯综合征',\n",
       " '小儿成人型呼吸窘迫综合征',\n",
       " '发育性静脉异常',\n",
       " '局限性非特异性眼眶炎症综合征',\n",
       " '新生儿呼吸困难综合征',\n",
       " '多脏器衰竭',\n",
       " '寂静型甲状腺炎',\n",
       " '粉刺',\n",
       " '人格解体障碍',\n",
       " '藏毛窦和藏毛囊肿',\n",
       " '厚皮性骨膜病',\n",
       " '蜈蚣螫',\n",
       " '粘多糖贮积病Ⅱ型',\n",
       " '褐黄症',\n",
       " '左房恶性粘液瘤',\n",
       " '内风',\n",
       " '儿童高血压',\n",
       " '老年尿崩症',\n",
       " '播散性毛囊性皮肤结核',\n",
       " '嗜酸细胞性肺炎',\n",
       " 'Canada综合征',\n",
       " '老年退行性心瓣膜病',\n",
       " '类球孢子菌病',\n",
       " '房性期外收缩',\n",
       " '外伤性斜颈',\n",
       " '小儿Anderson-Fabry综合征',\n",
       " '胃肠综合征',\n",
       " '马斯热',\n",
       " '放射性皮肤炎',\n",
       " '家族性秃发',\n",
       " '婴儿型髋内翻',\n",
       " '小儿假性甲状旁腺机能减退综合征',\n",
       " '新生儿剥脱性皮炎',\n",
       " '显微镜下多发性血管炎',\n",
       " '脂肪瘤',\n",
       " '网织细胞增殖症',\n",
       " '肺感染',\n",
       " '烂喉丹痧',\n",
       " '小儿冯·威利布兰德病',\n",
       " '蓝氏贾第鞭毛虫病',\n",
       " '肉芽肿性肠炎',\n",
       " '血脂过多',\n",
       " '小儿Menkes钢毛综合征',\n",
       " '卵巢功能缺如综合症',\n",
       " '北亚蜱传斑疹伤寒',\n",
       " '无汗',\n",
       " '自发心室纤维性颤动',\n",
       " '小儿肾病变综合征',\n",
       " '小儿高安动脉炎',\n",
       " '小儿无脉病',\n",
       " '瘰疬性苔藓',\n",
       " '小儿慢性肾炎',\n",
       " '单侧透明肺',\n",
       " '毛细胞性白血病',\n",
       " '角膜带状混浊',\n",
       " '赤(扁)虫病',\n",
       " '血管性水肿',\n",
       " '小儿低血糖症',\n",
       " '汉特综合征',\n",
       " '硅肺病',\n",
       " '振掉',\n",
       " '远侧肾小管酸中毒',\n",
       " '新生儿低血糖伴内脏肥大-巨舌-小脑综合征',\n",
       " '进行性肢端色素沉着症',\n",
       " '鼠型斑疹伤寒',\n",
       " '老花眼',\n",
       " '小儿高氨血症',\n",
       " '尿布湿疹',\n",
       " '瘿痈',\n",
       " '小儿周期性呕吐',\n",
       " '淋巴管性水肿',\n",
       " '小儿颜面-听-脊柱异常',\n",
       " '小儿病毒相关吞噬红细胞综合征',\n",
       " '肾乳头坏死',\n",
       " 'Warthin瘤',\n",
       " '小儿Pierre-Robin综合征',\n",
       " '菜农皮炎',\n",
       " '小儿慢性再生低下性中性粒细胞减少症',\n",
       " '中颅窝硬脑脊膜瘤',\n",
       " '女性膀胱颈部挛缩',\n",
       " '老年人心脏钙化综合征',\n",
       " '恶性透明细胞末端螺旋腺瘤',\n",
       " '粘膜炎奈瑟菌感染',\n",
       " '非对称性室间隔肥厚',\n",
       " '先天性中胚层发育不良',\n",
       " '脓性蜂窝组织炎性咽峡炎',\n",
       " '肥厚梗阻型心肌病',\n",
       " '全身性红斑狼疮伴发的精神障碍',\n",
       " '儿童丘疹性皮炎',\n",
       " '胡须顽湿',\n",
       " '淋巴瘤样肉芽肿病',\n",
       " '移行椎',\n",
       " '肋锁综合征',\n",
       " '牛肉绦虫病',\n",
       " '双食管畸形',\n",
       " '小儿急性感染性多发性神经炎',\n",
       " '小儿Bartter综合征',\n",
       " '发热性溃疡性痤疮',\n",
       " '幼年孤独癖',\n",
       " '蛇头疖',\n",
       " '石末沉着病',\n",
       " '肾细胞癌',\n",
       " '羊癫疯',\n",
       " '小儿伴血小板减少和湿疹的免疫缺陷',\n",
       " '后颅窝血肿',\n",
       " '东方马脑炎',\n",
       " '小儿13号染色体三体型综合征',\n",
       " '小儿Lignac综合征',\n",
       " '手足发绀',\n",
       " '老年脑血栓',\n",
       " '儿童Tourette综合征',\n",
       " '全身性皮肤搔痒病',\n",
       " '外阴纤维上皮性息肉',\n",
       " '外阴急性蜂窝组织炎症',\n",
       " '纵隔水囊肿与心包憩室',\n",
       " '肝肿瘤',\n",
       " '白大衣性高血压',\n",
       " '边缘溃疡',\n",
       " '色素性毛痣',\n",
       " '小儿竹刀鱼毒素中毒',\n",
       " '寿斑',\n",
       " '婴幼儿单体7综合征',\n",
       " '小儿儿童真性红细胞增多症',\n",
       " '儿童血栓性血小板减少性紫癜',\n",
       " '子宫纤维瘤',\n",
       " '立夫特山谷热',\n",
       " '子宫炎',\n",
       " '汗疱疹',\n",
       " '老年炎症性肠病',\n",
       " '小儿Caffey病',\n",
       " '小儿碳氧血红蛋白血症',\n",
       " '家族性混合性高脂血症',\n",
       " '拉草人螨',\n",
       " '酒毒性神经病',\n",
       " '莱米尔综合征',\n",
       " '晶状体过敏性眼内炎继发性青光眼',\n",
       " '小儿石骨症',\n",
       " '早老性皮质纹状体变性',\n",
       " 'I型糖尿病',\n",
       " 'SPFS盆底痉挛综合征',\n",
       " '小儿家族性低磷血症',\n",
       " '小儿抗磷脂综合症',\n",
       " '耳癣',\n",
       " '青少年类风湿性关节炎',\n",
       " '会阴Ⅲ度裂伤',\n",
       " '新生儿化脑',\n",
       " '蛀牙',\n",
       " '纵隔囊肿及肿瘤',\n",
       " '胰腺结石',\n",
       " '遗传性硬化性皮肤异色病',\n",
       " '牙周疾病',\n",
       " 'Cheek-Perry综合征',\n",
       " '小儿性腺发育障碍症',\n",
       " '轴周性硬化性脑炎',\n",
       " '老年多发性骨髓瘤',\n",
       " '眠弦赤烂',\n",
       " '南欧斑疹热',\n",
       " '缺血性心脏病',\n",
       " '廔疮',\n",
       " '硬纤维增殖性纤维瘤',\n",
       " '儿童型女性乳房肥大症',\n",
       " '老年人',\n",
       " 'Moyamoya病',\n",
       " '嗅觉减退',\n",
       " '匐行性脉络膜视网膜炎',\n",
       " '腿风',\n",
       " '伤痉',\n",
       " '精索恶性新生物',\n",
       " '多杀巴斯德菌感染',\n",
       " '遗传性高铁成红细胞性贫血',\n",
       " '老年心肌梗死',\n",
       " '急性病毒性心包炎',\n",
       " '咽喉病毒感染伴肠系膜及腹膜后淋巴结炎',\n",
       " '子宫颈管妊娠',\n",
       " '急性链球性皮肤坏死',\n",
       " '瘰疠',\n",
       " '克拉贝病',\n",
       " '老年血管性痴呆',\n",
       " '特发性热带脾肿大',\n",
       " '性交困难',\n",
       " '小儿Ⅱ型肾小管酸中毒',\n",
       " '酒精性衰退状态',\n",
       " '成人型T细胞性白血病',\n",
       " '足穿通性溃疡',\n",
       " '汗疹症',\n",
       " '小儿单纯性腺发育不全',\n",
       " '三尖瓣下移畸形',\n",
       " '老年支气管哮喘',\n",
       " '假性痛风',\n",
       " '肝上皮样血管内皮瘤',\n",
       " '慢性粘连性心包炎',\n",
       " '原发性法娄皮欧氏癌',\n",
       " '肝阳上逆',\n",
       " '糖尿病性类脂质渐进性坏死',\n",
       " '关节石病',\n",
       " '结核性肠系膜淋巴结炎',\n",
       " '新生儿巨细胞包涵体病',\n",
       " '脉络膜转移性肿瘤',\n",
       " '阿狄森病',\n",
       " '脑底异常血管网',\n",
       " '住肉孢子虫病',\n",
       " '囊样肠积气',\n",
       " '柯鲁病',\n",
       " '乳腺增生症',\n",
       " '小儿雷克林霍曾氏病',\n",
       " '小儿关节松弛症',\n",
       " '慢性输卵管炎',\n",
       " '急性乙型病毒性肝炎',\n",
       " '血奈热',\n",
       " '颊颏口周皮肤红变',\n",
       " '原发性弥漫性萎缩',\n",
       " '线瘊',\n",
       " '蝶骨硬脑膜肉瘤的眼眶病变',\n",
       " '小儿硬脑膜肉瘤',\n",
       " '阵发性房性心动过速',\n",
       " '腊特克瘤',\n",
       " '广泛视网膜周围增殖',\n",
       " '特发性面神经麻痹',\n",
       " '胎敛疮',\n",
       " '小儿Weber综合征',\n",
       " '脂溢性疣',\n",
       " '双束阻滞',\n",
       " '风牵出睑',\n",
       " '棘颚口线虫病',\n",
       " '壁冠状动脉',\n",
       " '先天愚症',\n",
       " '老年人特发性肺纤维变性',\n",
       " 'Roemheld综合征',\n",
       " '小儿获得性免疫缺陷综合症',\n",
       " '慢性闭角青光眼',\n",
       " '新生儿一过性脓疱性黑变病',\n",
       " '睾丸旁肿瘤',\n",
       " '耶尔森菌病',\n",
       " '亚硫酸酐中毒',\n",
       " '老年高脂血症',\n",
       " '蛇串疮',\n",
       " '包皮阴茎头炎',\n",
       " '焦虑性神经症',\n",
       " '肺不张病',\n",
       " '小儿成神经细胞瘤',\n",
       " '偏侧投掷症',\n",
       " '老年泌尿系统感染',\n",
       " '小儿肉样瘤病',\n",
       " '妊娠合并尿结石',\n",
       " '侵袭性指趾息肉状腺癌',\n",
       " '膀胱横纹肌肉瘤',\n",
       " '印度痘',\n",
       " '热痱红色粟粒疹',\n",
       " '膀胱印戒细胞癌',\n",
       " '小儿心面综合症',\n",
       " '攻击性婴儿纤维瘤病',\n",
       " '老年急性胰腺炎',\n",
       " '腹壁溶血性链球菌坏疽',\n",
       " '嗜麦芽窄食单胞菌肺炎',\n",
       " '闷气生',\n",
       " '舒张期心衰',\n",
       " '柯萨奇湿疹',\n",
       " '自身致敏性皮炎',\n",
       " '多结节性甲状腺肿伴甲亢',\n",
       " '慢性肠系膜血管供血不足',\n",
       " '小儿骨疣',\n",
       " '艾迪生病',\n",
       " '小儿房扑',\n",
       " '乳腺真皮淋巴癌病',\n",
       " '睾丸女性化综合征',\n",
       " '腹腔皮下综合征',\n",
       " '瘙痒性紫癜',\n",
       " '包皮嵌顿',\n",
       " '鼻炎',\n",
       " '儿童特殊发育障碍',\n",
       " '下肢深静脉血栓',\n",
       " '皮质类固醇性多发性肌病',\n",
       " '痤疮样皮疹',\n",
       " '小儿变应性亚败血症综合症',\n",
       " '小儿严重过敏反应',\n",
       " '老年播散性血管内凝血',\n",
       " '急性感染后肾小球肾炎',\n",
       " '黑蒙性白痴',\n",
       " '老年人结脑',\n",
       " '脑内上皮样瘤',\n",
       " '牙槽骨炎',\n",
       " '地中海贫血视网膜病变',\n",
       " '食物诱发哮喘',\n",
       " '鳃裂囊肿和瘘',\n",
       " '遗传性舞蹈病',\n",
       " '黏膜红斑病',\n",
       " '胆囊管部分阻塞综合征',\n",
       " '皮质下动脉硬化性脑病',\n",
       " '肾小管特发性扩张',\n",
       " '子宫直肠窝脓肿',\n",
       " '甲状腺毒性黏蛋白沉积症',\n",
       " '双束支阻滞',\n",
       " '肾动脉静脉瘘',\n",
       " '毛细血管扩张痣',\n",
       " '原发性视网膜脱离',\n",
       " '胎热',\n",
       " '卵巢过度刺激综合症',\n",
       " '精囊恶性新生物',\n",
       " '断耳疮',\n",
       " '杂色卟啉病',\n",
       " '先天性输尿管末端功能性梗阻',\n",
       " '结石性胆囊炎',\n",
       " '卡伯希水痘样疹',\n",
       " '炎性肠病',\n",
       " '过敏性肉芽肿症',\n",
       " '开放性脐尿管',\n",
       " '间隔支阻滞',\n",
       " '心包旁囊肿与心包憩室',\n",
       " '老年结肠息肉',\n",
       " '无睾畸形',\n",
       " '结核性甲状腺炎',\n",
       " '单纯性肺嗜酸细胞浸润症',\n",
       " '睾丸消失综合征',\n",
       " '毛真菌病',\n",
       " '小儿成人型低丙种球蛋白血症',\n",
       " '干皮肤',\n",
       " '胆小管性肝炎',\n",
       " '左前半分支阻滞',\n",
       " '色素性绒毛滑膜炎',\n",
       " '鼻部坏疽性肉芽肿',\n",
       " '杜克斯病',\n",
       " '蝶骨硬脑脊膜肉瘤的眼眶病变',\n",
       " '小梁性癌',\n",
       " '老年甲状腺癌',\n",
       " '急性局灶性细菌性肾炎',\n",
       " '假癌性软疣',\n",
       " '小儿药物超敏反应',\n",
       " '急性胃黏膜病变',\n",
       " '髌骨骨骺炎',\n",
       " '神经鞘磷脂沉积病',\n",
       " '小儿恶性甲状腺肿',\n",
       " '老年自发性气胸',\n",
       " 'Ⅰ度宫颈糜烂',\n",
       " '老年钙化性心瓣膜病',\n",
       " '小儿智力落后',\n",
       " '赖利-戴综合症',\n",
       " '肌收缩性头痛',\n",
       " '蠕形螨',\n",
       " '囊胞肾',\n",
       " '肾血管平滑肌脂肪瘤',\n",
       " '阴茎增殖性红斑',\n",
       " '梅累内氏坏疽',\n",
       " '伤后脂肪栓塞症',\n",
       " '老年人肠应激综合征',\n",
       " '胆囊运动障碍综合征',\n",
       " '葡萄疫',\n",
       " '角膜扁平细胞癌',\n",
       " 'Morton病',\n",
       " '急性间歇性卟啉症',\n",
       " '小儿耳-脊椎综合征',\n",
       " '呆小病',\n",
       " '突聋',\n",
       " '不典型(寂静)亚急性甲状腺炎',\n",
       " '淋巴肉瘤细胞白血病',\n",
       " '青年上肢远端肌萎缩',\n",
       " '粒-单核细胞型白血病',\n",
       " '蛇盘疮',\n",
       " '小儿充血性肺不张',\n",
       " '舌菌',\n",
       " '间叶性软骨肉瘤',\n",
       " '小儿过敏性胃肠炎',\n",
       " '胰腺恶性病变综合征',\n",
       " '青少年糖尿病',\n",
       " '近视',\n",
       " '急性宫颈炎',\n",
       " '黄体囊肿破裂',\n",
       " '前斜角肌综合征',\n",
       " '原发性单克隆丙种球蛋白病',\n",
       " '日光性弹力纤维综合征',\n",
       " '脑动静脉瘤',\n",
       " '外阴乳头状上皮瘤',\n",
       " '四咽囊综合征',\n",
       " '全身性神经节苷脂贮积症',\n",
       " '脐尿管未闭',\n",
       " '外阴汗管囊腺瘤',\n",
       " '疝脱',\n",
       " '指甲下外生骨疣',\n",
       " '小儿脑性瘫痪',\n",
       " '杨梅疮',\n",
       " '轴旁性桡侧半肢畸形',\n",
       " '小儿良性复发性无菌性脑膜炎',\n",
       " '红细胞肝性原卟啉症',\n",
       " '急性病毒性心肌炎',\n",
       " '颈椎后纵韧带骨化',\n",
       " '口疮等',\n",
       " '椎体破裂性骨折',\n",
       " '围绝经期功能障碍性子宫出血',\n",
       " '尖圭湿疣',\n",
       " '乳酸性酸中毒',\n",
       " '肠道腺病毒感染',\n",
       " '斯-韦二氏综合征',\n",
       " '头皮毛发瘤',\n",
       " '经腹会阴直肠癌根治术后盆底疝',\n",
       " '多中心性网状内皮系统组织细胞瘤病',\n",
       " '蔗尘沉着病',\n",
       " '丹毒样乳腺癌',\n",
       " '虹膜新血管化',\n",
       " 'cell',\n",
       " '胃穿孔',\n",
       " '醉酒',\n",
       " '支气管扩张症',\n",
       " '儿童期肺外结核',\n",
       " '肠气囊肿病',\n",
       " '第三',\n",
       " '小儿斯-约二氏综合征',\n",
       " '特发性扭转痉挛',\n",
       " '脑胶质细胞瘤',\n",
       " '新生儿荨麻疹',\n",
       " '腹膜后纤维变性',\n",
       " '小儿伯克氏肉样瘤',\n",
       " '地中海弛张热',\n",
       " '肠侵袭性埃希氏大肠杆菌感染',\n",
       " '睾丸淋巴腺瘤',\n",
       " '对称性点状和网状白斑病',\n",
       " '甲脆折',\n",
       " '外阴白斑',\n",
       " '磨工病',\n",
       " '恶性淋巴瘤',\n",
       " '皲裂伤口',\n",
       " '莱特勒西韦综合征',\n",
       " '血吸虫感染与肝胆疾病',\n",
       " '肺十二指肠虫病',\n",
       " '急性卡他性结膜炎',\n",
       " '囊型包虫病',\n",
       " '食管瘅',\n",
       " '局限性浅表性萎缩性硬皮病',\n",
       " '局限性亲表皮性网状细胞增生症',\n",
       " '葡萄糖-6-磷酸酶缺陷症',\n",
       " '老年食管癌',\n",
       " '经病发热',\n",
       " '小儿Dubin-Sprinz综合征',\n",
       " '肾石病',\n",
       " '库鲁症',\n",
       " '第六病',\n",
       " '晕船',\n",
       " '小儿四联畸型',\n",
       " '梦生',\n",
       " '小儿Horner综合征',\n",
       " '皮下组织浅深静脉的进行性坏疽',\n",
       " '人类微小病毒感染',\n",
       " '胆囊胆固醇沉着病',\n",
       " '小儿肾上腺生殖器综合征',\n",
       " '小儿心脏皮肤综合征',\n",
       " '老年晚年性癫痫',\n",
       " '腹股沟疝气',\n",
       " '新生儿ABO溶血症',\n",
       " '房内阻滞',\n",
       " '老年人夏伊-德雷格综合症',\n",
       " '胫前粘液性水肿',\n",
       " '猪囊虫病',\n",
       " '早期复极综合症',\n",
       " '外伤性窒息综合征',\n",
       " '间脑癫痫',\n",
       " '马鼻疽',\n",
       " '内源性高甘油三酯血症',\n",
       " '妊娠合并肾上腺皮质功能亢进',\n",
       " '颅骨愈合症',\n",
       " '球后视神经炎',\n",
       " '灰色血小板综合征',\n",
       " '急性右心衰竭',\n",
       " '布-马二氏综合征',\n",
       " '阴道内胚层窦瘤',\n",
       " '儿童期巨脑畸形综合征',\n",
       " '庞提阿克热',\n",
       " '脑梗塞',\n",
       " '髓状海绵样肾',\n",
       " '妊娠咳嗽',\n",
       " '牙本质过敏',\n",
       " '先天性高胆红素血症',\n",
       " '日光性皮炎',\n",
       " '多发性易消散性白点综合征',\n",
       " '小儿视觉性言语障碍',\n",
       " '坏死松解性游走性红斑',\n",
       " '空气栓塞症',\n",
       " '儿童水杨酸盐类中毒',\n",
       " '神经系统副肿瘤综合症',\n",
       " '气肿性膀胱炎',\n",
       " '许兰-亨诺氏血管炎',\n",
       " '牛皮癣',\n",
       " '小儿尤文瘤',\n",
       " '甲下骨疣',\n",
       " '产后精神抑郁',\n",
       " '先天性风瘾',\n",
       " '放射菌病',\n",
       " '妊娠剧吐',\n",
       " '肠病后类风湿',\n",
       " '癫痫发作与癫痫综合症',\n",
       " '军团菌肺炎',\n",
       " '西瓜形胃',\n",
       " '股动脉肿瘤',\n",
       " '慢性肺原性心脏病',\n",
       " '虫蚀状皮肤萎缩',\n",
       " '末端主动脉血栓形成综合征',\n",
       " '血管神经肌瘤',\n",
       " '颞下颌关节脱臼',\n",
       " '颌面部恶性肿瘤',\n",
       " '小儿视网膜成神经细胞瘤',\n",
       " '多形渗出性红斑',\n",
       " '儿童丘疹综合征',\n",
       " '非典型性佝偻病',\n",
       " '直肠脱出',\n",
       " '成人变应性亚败血症',\n",
       " '小儿Edwards综合征',\n",
       " '电焊工病',\n",
       " '马-班二氏综合征',\n",
       " '卡他球菌肺炎',\n",
       " '肋间神经炎',\n",
       " '婴儿脑性瘫痪',\n",
       " '埃里希体病',\n",
       " '中渗',\n",
       " '疱疹性脑炎',\n",
       " '小儿变形性肌张力不全',\n",
       " '小儿心柱低排血量综合征',\n",
       " '肾嗜酸性细胞瘤',\n",
       " '高红症',\n",
       " '门-腔分流性脊髓病',\n",
       " '非典型分枝杆菌病',\n",
       " '精子稀薄症',\n",
       " '脑内畸胎瘤',\n",
       " '雅司',\n",
       " '眼弓浆虫病',\n",
       " '旋前圆肌综合症',\n",
       " '苔薛样和斑疹形淀粉样变',\n",
       " '小儿Babinski-Frohlich综合征',\n",
       " '妊娠合并甲状腺功能减退症',\n",
       " '新生儿呼吸暂停症',\n",
       " '夏科氏关节病',\n",
       " '慢性非特异性溃疡性结肠炎',\n",
       " '胃粘膜脱垂症',\n",
       " '砂眼',\n",
       " '小儿感冒',\n",
       " '胃反',\n",
       " '麦芒皮炎',\n",
       " '臭鼻症',\n",
       " '椎管内肿瘤',\n",
       " '化脓性肾炎',\n",
       " '小儿Lowe综合征',\n",
       " '克-纳二氏综合征',\n",
       " '无α脂蛋白血症',\n",
       " '恶性组织细胞瘤',\n",
       " '高血钾型肾小管性酸中毒',\n",
       " '海绵状心肌',\n",
       " '北亚蜱传斑点热',\n",
       " '疱疹性脓疱病',\n",
       " '肠痈',\n",
       " '小儿金葡菌肺炎',\n",
       " '毛囊周围假性胶样痣',\n",
       " '肝癌',\n",
       " '舌咽神经痛性抽搐',\n",
       " '上皮内上皮癌',\n",
       " '藻菌病',\n",
       " '勃起机能障碍',\n",
       " '稻瘟病',\n",
       " 'Osgood-Schlatter病',\n",
       " '外阴汗腺腺瘤',\n",
       " '膈症',\n",
       " '黑素细胞痣',\n",
       " '急性炎症性脱髓鞘性多发性神经病',\n",
       " '口角唇炎',\n",
       " '非伤寒沙门氏菌感染',\n",
       " '急性扁桃体炎',\n",
       " '老年人慢性粒细胞性白血病',\n",
       " 'Rapunzel综合征',\n",
       " '结节性多发性动脉炎性巩膜炎',\n",
       " '变应性小动脉炎',\n",
       " '脑脊膜瘤病',\n",
       " '七日风',\n",
       " '骨骺软骨性巨细胞瘤',\n",
       " '书写麻痹',\n",
       " '神经原发类癌',\n",
       " '卡氏肺囊虫性肺炎',\n",
       " '支气管肺发育不良',\n",
       " '外阴粒细胞性成肌细胞瘤',\n",
       " '肥胖带绦虫病',\n",
       " '小肠变态反应性紫癜',\n",
       " '镰孢霉病',\n",
       " '胃饮',\n",
       " '新生儿脓疮病',\n",
       " '白血病样反应',\n",
       " '虹膜睫状体炎',\n",
       " '异位妊娠',\n",
       " '小儿副神经节瘤',\n",
       " '痱疖',\n",
       " '牵引性骨骺炎',\n",
       " '急性全自主神经病',\n",
       " '心内膜瘤',\n",
       " '结核病',\n",
       " '淋',\n",
       " '小肠结肠炎耶尔森氏菌肺炎',\n",
       " '黑癣',\n",
       " '原发性肾上腺皮质功能不全',\n",
       " '老年人脊椎基底动脉供血不足',\n",
       " '小儿浓眉-小头-短肢综合征',\n",
       " '消化功能紊乱',\n",
       " '无脉症',\n",
       " '疮毒走黄',\n",
       " '蛋白质能量营养不良症',\n",
       " '良性头部组织细胞增多病',\n",
       " '精子减少症',\n",
       " '艾滋病的神经系统表现',\n",
       " 'Ⅱ型RTA',\n",
       " '兔眼性角膜炎',\n",
       " '窦性停顿',\n",
       " '小儿遗尿症',\n",
       " '福克斯-福代斯二氏病',\n",
       " '老年人胃腺癌',\n",
       " '特发性点滴状浅黑变病',\n",
       " '婴儿一过性低丙种球蛋白血症',\n",
       " '成团泛菌感染',\n",
       " '眼-神经-皮肤血管瘤病',\n",
       " '小儿牙本质生长不全综合症',\n",
       " '盆底腹膜膨出',\n",
       " '睫不能移动综合征',\n",
       " '妊娠合并甲旁减',\n",
       " '震颤麻痹',\n",
       " '非典型性纤维黄色瘤',\n",
       " '眼睑皮肤丹毒',\n",
       " '性交恐惧症',\n",
       " '痱毒',\n",
       " 'Morquio氏病',\n",
       " '萎缩性阴道炎',\n",
       " 'Austin型异染性脑白质营养不良',\n",
       " '淋閟',\n",
       " '老年缺血性心肌病',\n",
       " '视网膜静脉闭塞',\n",
       " '疣状表皮结构不良',\n",
       " 'Seigal-Cattan-Mamon综合征',\n",
       " '视网膜静脉阻塞',\n",
       " '贲门痉挛',\n",
       " '干裂疮',\n",
       " '新生儿重复回肠',\n",
       " '心脏结节病',\n",
       " '粒型咽炎',\n",
       " '室性过早搏动',\n",
       " '花粉症',\n",
       " '孤立性毛外根鞘瘤',\n",
       " '小儿桥本甲状腺炎',\n",
       " '小儿Riley-Day综合征',\n",
       " '脚气心',\n",
       " '交媾困难',\n",
       " '出血性卒中',\n",
       " '肠道病原性大肠杆菌感染',\n",
       " '粘多糖症',\n",
       " '内火',\n",
       " '小儿先天性中性粒细胞减少症',\n",
       " '胆管细胞癌',\n",
       " '间歇依赖性多形性室性心动过速',\n",
       " '老年胆囊炎',\n",
       " '粘膜炎莫拉菌感染',\n",
       " '流皮漏',\n",
       " ...]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "alias_vovab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the food alias spreadsheet; empty cells are replaced by the sentinel string 'NONE'.\n",
    "# f_name[k] and f_alias[k] come from the same row.\n",
    "f = pd.read_excel('./food-alias.xlsx').fillna('NONE')\n",
    "f_name = f['food'].values.tolist()\n",
    "f_alias = f['alias'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "f_a_f = []\n",
    "f_d = {}\n",
    "# Group aliases by food name in row order; rows whose alias is the 'NONE' sentinel are skipped.\n",
    "for food, alias in zip(f_name, f_alias):\n",
    "    if alias == 'NONE':\n",
    "        continue\n",
    "    f_d.setdefault(food, []).append(alias)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build relation samples for foods. Uses the sentence template `temp` from the disease cell.\n",
    "food_alias_add_data = []\n",
    "for key, item in f_d.items():\n",
    "    # '别名' samples: the food name is <e1>, one alias at a time is <e2>.\n",
    "    for idx, alias in enumerate(item):\n",
    "        tagged = list(item)\n",
    "        tagged[idx] = '<e2>' + alias + '</e2>'\n",
    "        food_alias_add_data.append('别名' + '\\t' + temp.format('<e1>' + key + '</e1>', '、'.join(tagged)))\n",
    "    # 'unknow' samples: every ordered pair (earlier alias, later alias) is tagged <e1>/<e2>.\n",
    "    for idx in range(len(item)):\n",
    "        for jdx in range(idx + 1, len(item)):\n",
    "            tagged = list(item)\n",
    "            tagged[idx] = '<e1>' + item[idx] + '</e1>'\n",
    "            tagged[jdx] = '<e2>' + item[jdx] + '</e2>'\n",
    "            food_alias_add_data.append('unknow' + '\\t' + temp.format(key, '、'.join(tagged)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8934"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(food_alias_add_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28917"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(disease_alias_add_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = food_alias_add_data + disease_alias_add_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cal_e1(line):\n",
    "    \"\"\"Return True when `line` contains the '<e1>' marker at most once, False otherwise.\"\"\"\n",
    "    return line.count('<e1>') < 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cal_e1('22<e1>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append every generated sample to the output file, one sample per line.\n",
    "with open('./alias_add_data.txt', 'a+') as f:\n",
    "    for sample in all_data:\n",
    "        f.write(sample + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the generated samples by label (the text before the first tab):\n",
    "# the `_a` lists hold '别名' samples, the `_u` lists hold everything else.\n",
    "food_alias_add_data_a = [s for s in food_alias_add_data if s.split('\\t')[0] == '别名']\n",
    "food_alias_add_data_u = [s for s in food_alias_add_data if s.split('\\t')[0] != '别名']\n",
    "\n",
    "disease_alias_add_data_a = [s for s in disease_alias_add_data if s.split('\\t')[0] == '别名']\n",
    "disease_alias_add_data_u = [s for s in disease_alias_add_data if s.split('\\t')[0] != '别名']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11511\n",
      "17406\n",
      "2046\n",
      "6888\n"
     ]
    }
   ],
   "source": [
    "# Sample counts per bucket, one per line: disease '别名', disease other, food '别名', food other.\n",
    "for bucket in (disease_alias_add_data_a, disease_alias_add_data_u,\n",
    "               food_alias_add_data_a, food_alias_add_data_u):\n",
    "    print(len(bucket))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so the shuffles, and therefore the train/test split, are the same on every run.\n",
    "SPLIT_SEED = 42\n",
    "MAX_ALIAS_SAMPLES = 1888    # cap on '别名' samples (value kept from the original notebook)\n",
    "MAX_UNKNOWN_SAMPLES = 1222  # cap on the remaining samples (value kept from the original notebook)\n",
    "TRAIN_FRACTION = 0.8\n",
    "\n",
    "\n",
    "def _is_usable(sample):\n",
    "    \"\"\"Return True when `sample` passes cal_e1, contains '<e2>' and does not contain '综合征又'.\"\"\"\n",
    "    return cal_e1(sample) and '<e2>' in sample and '综合征又' not in sample\n",
    "\n",
    "\n",
    "random.seed(SPLIT_SEED)\n",
    "\n",
    "# a: '别名' samples from both sources; u: all other samples from both sources.\n",
    "a = food_alias_add_data_a + disease_alias_add_data_a\n",
    "random.shuffle(a)\n",
    "u = food_alias_add_data_u + disease_alias_add_data_u\n",
    "random.shuffle(u)\n",
    "\n",
    "# Filter after shuffling so the caps below take a random subset of the usable samples.\n",
    "a_1 = [sample for sample in a if _is_usable(sample)]\n",
    "u_1 = [sample for sample in u if _is_usable(sample)]\n",
    "\n",
    "a = a_1[:MAX_ALIAS_SAMPLES]\n",
    "u = u_1[:MAX_UNKNOWN_SAMPLES]\n",
    "\n",
    "# a_l / u_l: index where the train part of each class ends.\n",
    "a_l = math.floor(TRAIN_FRACTION * len(a))\n",
    "u_l = math.floor(TRAIN_FRACTION * len(u))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = a[:a_l] + u[:u_l]\n",
    "test = a[a_l:] + u[u_l:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx, i in enumerate(train):\n",
    "    if '<e2>' not in i:\n",
    "        print(train[idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2487\n",
      "623\n"
     ]
    }
   ],
   "source": [
    "print(len(train))\n",
    "print(len(test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.shuffle(train)\n",
    "random.shuffle(test)\n",
    "with open('./alia_train.txt', 'a+') as f:\n",
    "    for i in train:\n",
    "        label = i.split('\\t')\n",
    "        if label[0] == 'unknow' or label[0] == 'unknown':\n",
    "            label[0] = 'unKnown'\n",
    "        i = '\\t'.join(label)\n",
    "        f.writelines(i)\n",
    "        f.writelines('\\n')\n",
    "        \n",
    "with open('./alia_test.txt', 'a+') as f:\n",
    "    for i in test:\n",
    "        label = i.split('\\t')\n",
    "        if label[0] == 'unknow' or label[0] == 'unknown':\n",
    "            label[0] = 'unKnown'\n",
    "        i = '\\t'.join(label)\n",
    "        f.writelines(i)\n",
    "        f.writelines('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = []\n",
    "test_data = []\n",
    "\n",
    "with open('./alia_train.txt', 'r') as f:\n",
    "    for i in f.readlines():\n",
    "        train_data.append(i.strip())\n",
    "\n",
    "with open('./alia_test.txt', 'r') as f:\n",
    "    for i in f.readlines():\n",
    "        test_data.append(i.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sentence,relation,head,head_type,head_offset,tail,tail_type,tail_offset\n",
    "# 鲜香菇更适合老人小孩吃：鲜香菇相对于干香菇有一种能促进维生素D合成的物质，而且更易于消化，老人孩子要想吃香菇，鲜香菇更适合。,适宜,孩子,crowd,47,维生素D,nutrient,27"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "老年人急性呼吸窘迫综合征又称为老年人急性呼吸窘迫综合症\n",
      "结核性脊髓炎又称为结核性脊膜脊髓炎\n",
      "老年人病态窦房结综合征又称为老年病态窦房结综合征，老年人病窦综合征，老年人病态窦房结综合症\n",
      "脑梗死又称为脑梗塞\n",
      "艾叶又称为艾草、艾、冰台、艾蒿、医草、灸草、蕲艾、黄草、家艾、甜艾、草蓬、艾蓬、狼尾蒿子、香艾、野莲头、阿及艾、陈艾、灰草、大艾叶、杜艾叶、萎蒿\n",
      "核桃又称为核桃\n",
      "知了又称为蚱蝉、油蝉、知了、蜘蟟、叽喳虫\n",
      "肺隔离症又称为肺分离，肺隔离，隔离肺\n",
      "老年人乳腺癌又称为老年人乳癌，老年人乳粟，老年乳腺癌\n",
      "八角又称为舶上茴香、舶茴香、八角珠、八角香、八角、大茴、原油茴、八月珠、大料、五香、八角\n",
      "老年人脑栓塞又称为老年脑栓塞，老年人脑动脉栓塞\n",
      "燕麦又称为燕麦、野麦、雀麦\n",
      "老年人夏伊-德雷格综合征又称为老年Shy-Drager综合征，老年人Shy-Drager综合征，老年人夏-德综合征，老年人夏伊-德雷格综合症\n",
      "老年人真性红细胞增多症又称为老年人真性红细胞增多，老年真性红细胞增多症\n",
      "洋葱又称为洋葱头、玉葱、葱头、圆葱、球葱、葱头\n",
      "老年人帕金森病又称为老年帕金森病，老年人帕金森氏病，老年人震颤麻痹，老年人震颤性假麻痹，颤病，脑风\n",
      "黑加仑子又称为黑加仑子、黑加仑、黑豆果、黑醋栗\n",
      "老年人甲状腺癌又称为老年甲状腺癌，老年人恶性甲状腺肿，老年恶性甲状腺肿\n",
      "老年人心脏传导阻滞又称为老年人传导阻滞，老年人心传导阻滞，老年心脏传导阻滞\n",
      "基底核钙化症又称为基底核钙化，基底节钙化症，基底神经节钙化，特发性基底节钙化\n",
      "乳鸽又称为乳鸽、鹁鸽\n",
      "竹笋又称为笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌\n",
      "结肠黑变病又称为结肠黑色素沉着病\n",
      "妊娠合并红细胞增多症又称为妊娠合并红细胞增多\n",
      "红菌菇又称为红菌、红菇\n",
      "老年人脑心综合征又称为老年脑心综合征，老年人脑心综合症\n",
      "低血糖症又称为饥厥，食厥，低血糖\n",
      "竹笋又称为笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌\n",
      "垂体危象与垂体卒中又称为垂体危象，垂体卒中\n",
      "抗胰蛋白酶缺乏症又称为抗胰蛋白酶不足，抗胰蛋白酶缺乏\n",
      "芝麻油又称为麻油、芝麻油\n",
      "老年人黏液性瓣膜病又称为老年人粘液性瓣膜病\n",
      "颅内肿瘤伴发的精神障碍又称为颅内肿瘤伴发的精神病，颅内肿瘤伴发的精神错乱，颅内肿瘤伴发的精神紊乱\n",
      "小儿遗传性大疱性表皮松解症又称为小儿遗传性大疱性表皮松解\n",
      "静脉血栓形成又称为静脉血栓\n",
      "老年人室性期前收缩又称为老年人室性过早搏动，老年人室性期外收缩，老年室性早搏\n",
      "原发性小肠淋巴管扩张症又称为原发性小肠淋巴管扩张\n",
      "遗传性乳腺癌-卵巢癌综合征又称为乳腺癌-卵巢癌综合征，遗传性乳腺癌-卵巢癌，遗传性乳腺癌-卵巢癌综合症\n",
      "竹笋又称为笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌、笋、毛笋、竹芽、竹萌\n",
      "老年人脂肪肝又称为老年人肝积脂病，老年脂肪肝\n",
      "八角又称为舶上茴香、舶茴香、八角珠、八角香、八角、大茴、原油茴、八月珠、大料、五香、八角\n"
     ]
    }
   ],
   "source": [
    "f = pd.read_excel('./food-alias.xlsx').fillna('NONE')\n",
    "f_name = f['food'].values.tolist()\n",
    "f_alias = f['alias'].values.tolist()\n",
    "\n",
    "f_data = list(set(f_name + f_alias))\n",
    "\n",
    "new_train_data = []\n",
    "new_test_data = []\n",
    "\n",
    "for i in train_data:\n",
    "    line = i\n",
    "    spi = line.split('\\t')\n",
    "    label = spi[0]\n",
    "    e1_idx = spi[1].index('<e1>')\n",
    "    e1_2_idx = spi[1].index('</e1>')\n",
    "    e2_idx = spi[1].index('<e2>')\n",
    "    e2_2_idx = spi[1].index('</e2>')\n",
    "    e1 = spi[1][e1_idx + 4: e1_2_idx]\n",
    "    e2 = spi[1][e2_idx + 4: e2_2_idx]\n",
    "    if 'e1' in e2 or 'e2' in e1:\n",
    "        continue\n",
    "#         print(line)\n",
    "    new_s = spi[1].replace('<e1>', '').replace('</e1>', '').replace('<e2>', '').replace('</e2>', '').replace(',', '，')\n",
    "    if e1 in f_data or e2 in f_data:\n",
    "        e_label = 'food'\n",
    "    else:\n",
    "        e_label = 'disease'\n",
    "        \n",
    "    if new_s.index(e1) == new_s.index(e2) or e1 == e2:\n",
    "        print(new_s)\n",
    "        continue\n",
    "    \n",
    "    new_s = new_s + ',' + label + ',' + e1 + ',' + e_label + ',' + str(new_s.index(e1)) + ',' + e2 + ',' + e_label + ',' + str(new_s.index(e2))\n",
    "    \n",
    "    new_train_data.append(new_s)\n",
    "\n",
    "for i in test_data:\n",
    "    line = i\n",
    "    spi = line.split('\\t')\n",
    "    label = spi[0]\n",
    "    e1_idx = spi[1].index('<e1>')\n",
    "    e1_2_idx = spi[1].index('</e1>')\n",
    "    e2_idx = spi[1].index('<e2>')\n",
    "    e2_2_idx = spi[1].index('</e2>')\n",
    "    e1 = spi[1][e1_idx + 4: e1_2_idx]\n",
    "    e2 = spi[1][e2_idx + 4: e2_2_idx]\n",
    "    if 'e1' in e2 or 'e2' in e1:\n",
    "        continue\n",
    "#         print(line)\n",
    "    new_s = spi[1].replace('<e1>', '').replace('</e1>', '').replace('<e2>', '').replace('</e2>', '').replace(',', '，')\n",
    "    if e1 in f_data or e2 in f_data:\n",
    "        e_label = 'food'\n",
    "    else:\n",
    "        e_label = 'disease'\n",
    "        \n",
    "    if new_s.index(e1) == new_s.index(e2) or e1 == e2:\n",
    "        print(new_s)\n",
    "        continue\n",
    "    \n",
    "    new_s = new_s + ',' + label + ',' + e1 + ',' + e_label + ',' + str(new_s.index(e1)) + ',' + e2 + ',' + e_label + ',' + str(new_s.index(e2))\n",
    "    \n",
    "    new_test_data.append(new_s)\n",
    "\n",
    "with open('./new_train.txt', 'a+') as f1:\n",
    "    for i in new_train_data:\n",
    "        f1.writelines(i)\n",
    "        f1.writelines('\\n')\n",
    "        \n",
    "with open('./new_test.txt', 'a+') as f2:\n",
    "    for i in new_test_data:\n",
    "        f2.writelines(i)\n",
    "        f2.writelines('\\n')\n",
    "#     print(e1_idx)\n",
    "#     print(e1_2_idx)\n",
    "#     print(e2_idx)\n",
    "#     print(e2_2_idx)\n",
    "#     print(e1)\n",
    "#     print(e2)\n",
    "#     print(line.split('\\t'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = train + test\n",
    "for i in c:\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "氏病又称为\n"
     ]
    }
   ],
   "source": [
    "print('\\u6c0f\\u75c5\\u53c8\\u79f0\\u4e3a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'1122233'.in"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8348"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1671 + 6677"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1888\n",
      "2113\n",
      "1879\n",
      "1420\n",
      "1047\n",
      "8347\n"
     ]
    }
   ],
   "source": [
    "bieming = []\n",
    "shiyi = []\n",
    "buyi = []\n",
    "unk = []\n",
    "hanyou = []\n",
    "all_data = []\n",
    "\n",
    "with open('./count/rbert_train.csv', 'r') as f:\n",
    "    for i in f.readlines():\n",
    "        label = i.split('\\t')[0]\n",
    "        if label == '别名':\n",
    "            bieming.append(label)\n",
    "        elif label == '适宜':\n",
    "            shiyi.append(label)\n",
    "        elif label == '不宜':\n",
    "            buyi.append(label)\n",
    "        elif label == '含有':\n",
    "            hanyou.append(label)\n",
    "        else:\n",
    "            unk.append(label)\n",
    "        all_data.append(label)\n",
    "\n",
    "with open('./count/rbert_test.csv', 'r') as f:\n",
    "    for i in f.readlines():\n",
    "        label = i.split('\\t')[0]\n",
    "        if label == '别名':\n",
    "            bieming.append(label)\n",
    "        elif label == '适宜':\n",
    "            shiyi.append(label)\n",
    "        elif label == '不宜':\n",
    "            buyi.append(label)\n",
    "        elif label == '含有':\n",
    "            hanyou.append(label)\n",
    "        else:\n",
    "            unk.append(label)\n",
    "        all_data.append(label)\n",
    "\n",
    "print(len(bieming))\n",
    "print(len(shiyi))\n",
    "print(len(buyi))\n",
    "print(len(unk))\n",
    "print(len(hanyou))\n",
    "print(len(all_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
